#Warning ignorance if generated
import warnings
warnings.filterwarnings("ignore")
#import necessary python packages for single-cell RNA SEQ analysis
import scanpy as sc #software suite of tools for single-cell analysis in python
import besca as bc #internal BEDA package for single cell analysis
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import scipy
import anndata as ad
from scipy.sparse import csr_matrix
import scanpy.external as sce
from harmony import harmonize
import umap.umap_ as umap
print(ad.__version__)
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
# gives error!! sc.logging.print_versions()
INFO:torch.distributed.nn.jit.instantiator:Created a temporary directory at /tmp/tmpqtu1q5x0 INFO:torch.distributed.nn.jit.instantiator:Writing /tmp/tmpqtu1q5x0/_remote_module_non_scriptable.py INFO:lightning_fabric.utilities.seed:Global seed set to 0
0.9.1
#particular displaying result settings for single cell analysis
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')
scanpy==1.9.3 anndata==0.9.1 umap==0.3.10 numpy==1.23.5 scipy==1.10.1 pandas==2.0.1 scikit-learn==1.2.2 statsmodels==0.14.0 python-igraph==0.10.4 louvain==0.8.0 pynndescent==0.5.10
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/_settings.py:447: DeprecationWarning: `set_matplotlib_formats` is deprecated since IPython 7.23, directly use `matplotlib_inline.backend_inline.set_matplotlib_formats()`
#loading all dataset into the workspace
#Load Disease PBMC dataset1 for sarcoidosis
sarc1=sc.read_10x_mtx('/raid02/Data-live/tjana/LIB5455299_SAM24412250/outs/filtered_feature_bc_matrix',
var_names='gene_symbols',cache=True)
#Load Disease PBMC dataset2 for sarcoidosis
sarc2=sc.read_10x_mtx('/raid02/Data-live/tjana/LIB5455301_SAM24412252/outs/filtered_feature_bc_matrix',
var_names='gene_symbols',cache=True)
#Load Disease PBMC dataset1 for sarcoidosis
sarc3=sc.read_10x_mtx('/raid02/Data-live/tjana/LIB5455303_SAM24412254/outs/filtered_feature_bc_matrix',
var_names='gene_symbols',cache=True)
#load Healthy PBMC Control1
healthy1=sc.read_10x_mtx('/home/jana/healthypbmc/healthy1/filtered_feature_bc_matrix',
var_names='gene_symbols',cache=True)
#load Healthy PBMC Control2
healthy2=sc.read_10x_mtx('/home/jana/healthypbmc/healthy2/filtered_feature_bc_matrix',
var_names='gene_symbols',cache=True)
#load Healthy PBMC Control3
healthy3=sc.read_10x_mtx('/home/jana/healthypbmc/healthy4/filtered_feature_bc_matrix',
var_names='gene_symbols',cache=True)
... reading from cache file cache/raid02-Data-live-tjana-LIB5455299_SAM24412250-outs-filtered_feature_bc_matrix-matrix.h5ad ... reading from cache file cache/raid02-Data-live-tjana-LIB5455301_SAM24412252-outs-filtered_feature_bc_matrix-matrix.h5ad ... reading from cache file cache/raid02-Data-live-tjana-LIB5455303_SAM24412254-outs-filtered_feature_bc_matrix-matrix.h5ad ... reading from cache file cache/home-jana-healthypbmc-healthy1-filtered_feature_bc_matrix-matrix.h5ad ... reading from cache file cache/home-jana-healthypbmc-healthy2-filtered_feature_bc_matrix-matrix.h5ad ... reading from cache file cache/home-jana-healthypbmc-healthy4-filtered_feature_bc_matrix-matrix.h5ad
#Making all indexes into unique of all samples (Disease SARCOID and Healthy)
#sarcoid
sarc1.var_names_make_unique()
sarc2.var_names_make_unique()
sarc3.var_names_make_unique()
#healthy/control
healthy1.var_names_make_unique()
healthy2.var_names_make_unique()
healthy3.var_names_make_unique()
# Adding some metadata for all samples
sarc1.obs['type']="Sarc"
sarc1.obs['sample']="Sarc-1"
sarc2.obs['type']="Sarc"
sarc2.obs['sample']="Sarc-2"
sarc3.obs['type']="Sarc"
sarc3.obs['sample']="Sarc-3"
healthy1.obs['type']="healthy"
healthy1.obs['sample']="healthy-1"
healthy2.obs['type']="healthy"
healthy2.obs['sample']="healthy-2"
healthy3.obs['type']="healthy"
healthy3.obs['sample']="healthy-3"
# All samples are merged into one object named adata (anndata - Annotated data)
adata = sarc1.concatenate(sarc2, sarc3, healthy1, healthy2, healthy3)
# Deleting individual datasets to save space
del(sarc1, sarc2, sarc3)
del(healthy1, healthy2, healthy3)
/home/jana/my-notebook-venv/lib/python3.8/site-packages/anndata/_core/anndata.py:1755: FutureWarning: The AnnData.concatenate method is deprecated in favour of the anndata.concat function. Please use anndata.concat instead. See the tutorial for concat at: https://anndata.readthedocs.io/en/latest/concatenation.html
# Displaying merged object observation and variables types
print (adata)
print ("------")
#Displaying counts cells sample wise
print(adata.obs['sample'].value_counts())
#Displaying counts total cells counts of healthy and Sarcoid (Sarc)
print ("------")
print(adata.obs['type'].value_counts())
AnnData object with n_obs × n_vars = 74050 × 36601
obs: 'type', 'sample', 'batch'
var: 'gene_ids', 'feature_types'
------
sample
healthy-3 23837
healthy-1 11996
healthy-2 11996
Sarc-2 10029
Sarc-3 8754
Sarc-1 7438
Name: count, dtype: int64
------
type
healthy 47829
Sarc 26221
Name: count, dtype: int64
# Print merged adata var (variable) types
print (adata.var)
| gene_ids | feature_types | |
|---|---|---|
| MIR1302-2HG | ENSG00000243485 | Gene Expression |
| FAM138A | ENSG00000237613 | Gene Expression |
| OR4F5 | ENSG00000186092 | Gene Expression |
| AL627309.1 | ENSG00000238009 | Gene Expression |
| AL627309.3 | ENSG00000239945 | Gene Expression |
| ... | ... | ... |
| AC141272.1 | ENSG00000277836 | Gene Expression |
| AC023491.2 | ENSG00000278633 | Gene Expression |
| AC007325.1 | ENSG00000276017 | Gene Expression |
| AC007325.4 | ENSG00000278817 | Gene Expression |
| AC007325.2 | ENSG00000277196 | Gene Expression |
36601 rows × 2 columns
# Print merged adata obs observation types
print (adata.obs)
| type | sample | batch | |
|---|---|---|---|
| AAACCCAAGACATAAC-1-0 | Sarc | Sarc-1 | 0 |
| AAACCCAAGAGGCGGA-1-0 | Sarc | Sarc-1 | 0 |
| AAACCCAAGCGTACAG-1-0 | Sarc | Sarc-1 | 0 |
| AAACCCAAGGTACAAT-1-0 | Sarc | Sarc-1 | 0 |
| AAACCCACAGCGTACC-1-0 | Sarc | Sarc-1 | 0 |
| ... | ... | ... | ... |
| TTTGTTGGTTCAAGGG-1-5 | healthy | healthy-3 | 5 |
| TTTGTTGTCACCTGGG-1-5 | healthy | healthy-3 | 5 |
| TTTGTTGTCATTGAGC-1-5 | healthy | healthy-3 | 5 |
| TTTGTTGTCCGATGTA-1-5 | healthy | healthy-3 | 5 |
| TTTGTTGTCGTGGCTG-1-5 | healthy | healthy-3 | 5 |
74050 rows × 3 columns
# Identification of mitochondrial genes by giving a pattern 'MT-'
adata.var['mt'] = adata.var_names.str.startswith('MT-')
# Identification ribosomal genes by giving a pattern 'RPS/RPL'
adata.var['ribo'] = adata.var_names.str.startswith(("RPS","RPL"))
# Identification hemoglobin genes by giving a pattern ^HB[^(P)]
adata.var['hb'] = adata.var_names.str.contains(("^HB[^(P)]"))
# Print merged adata varibale
print ("adata variables types including genes_id, feature types, mt, ribo and hb")
print("----------------")
print (adata.var)
adata variables types including genes_id, feature types, mt, ribo and hb
----------------
gene_ids feature_types mt ribo hb
MIR1302-2HG ENSG00000243485 Gene Expression False False False
FAM138A ENSG00000237613 Gene Expression False False False
OR4F5 ENSG00000186092 Gene Expression False False False
AL627309.1 ENSG00000238009 Gene Expression False False False
AL627309.3 ENSG00000239945 Gene Expression False False False
... ... ... ... ... ...
AC141272.1 ENSG00000277836 Gene Expression False False False
AC023491.2 ENSG00000278633 Gene Expression False False False
AC007325.1 ENSG00000276017 Gene Expression False False False
AC007325.4 ENSG00000278817 Gene Expression False False False
AC007325.2 ENSG00000277196 Gene Expression False False False
[36601 rows x 5 columns]
#Calculating the QC metrices of merged object adata
print ("......QC metrices....calculating")
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt','ribo','hb'], percent_top=None, log1p=False, inplace=True)
print ("......QC metrices....calculating done")
......QC metrices....calculating ......QC metrices....calculating done
#Now you can see that we have additional data in the scanpy obs slot.
print("Computing cell compute fraction of counts in mito genes vs. all genes")
mito_genes = adata.var_names.str.startswith('MT-')
# for each cell compute fraction of counts in mito genes vs. all genes
# the `.A1` is only necessary as X is sparse (to transform to a dense array after summing)
adata.obs['percent_mt2'] = np.sum(
adata[:, mito_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1
# Adding the total counts per cell as observations-annotation to adata
adata.obs['n_counts'] = adata.X.sum(axis=1).A1
print ("computing done")
Computing cell compute fraction of counts in mito genes vs. all genes computing done
#Violin plot of whole QC metrices of merged object in samples wise
print ("Pre QC metrices")
sc.pl.violin(adata, ['total_counts', 'n_genes_by_counts', 'pct_counts_mt','pct_counts_ribo', 'pct_counts_hb'],
jitter=0.4, groupby = 'sample', rotation= 45, multi_panel=True)
Pre QC metrices
#Scatter plot: (total counts vs. pct_counts_mt) & (total counts vs n_genes_by_counts): sample wise
print ("Scatter plots")
sc.pl.scatter(adata, x='total_counts', y='pct_counts_mt', color="sample")
sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts', color="sample")
Scatter plots
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_anndata.py:490: MatplotlibDeprecationWarning: The legendHandles attribute was deprecated in Matplotlib 3.7 and will be removed two minor releases later. Use legend_handles instead.
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_anndata.py:490: MatplotlibDeprecationWarning: The legendHandles attribute was deprecated in Matplotlib 3.7 and will be removed two minor releases later. Use legend_handles instead.
# Basic Filtering with minimum number of cells and genes
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)
# Displaying Number of observations and Number of variables/features
print ("adata contains observations and variables")
print(adata.n_obs, adata.n_vars)
adata contains observations and variables 73171 26137
# Fixing cutoff for mt and ribo for percent mito
print ("Fixing cutoff for mt and ribo for post processing")
adata = adata[adata.obs['pct_counts_mt'] < 20, :]
# filter for percent ribo > 0.05
adata = adata[adata.obs['pct_counts_ribo'] > 5, :]
print("Remaining cells %d"%adata.n_obs)
Fixing cutoff for mt and ribo for post processing Remaining cells 71184
#After fitering mito and ribo part, the Violin plot of whole QC metrices of merged object in samples wise
print ("final QC cutoffs for post processing")
sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt','pct_counts_ribo', 'pct_counts_hb'],
jitter=0.4, groupby = 'sample', rotation = 45)
#top 20 Highest expressing genes
print ("Highest expressed genes post QC cutoff chosen")
sc.pl.highest_expr_genes(adata, n_top=20)
Highest expressed genes post QC cutoff chosen
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/preprocessing/_normalization.py:170: UserWarning: Received a view of an AnnData. Making a copy.
normalizing counts per cell
finished (0:00:01)
# we need to redefine the mito_genes,hb_genes, since they were first
# calculated on the full object before removing low expressed genes.
print ("Removing some unwanted genes named MALAT1")
malat1 = adata.var_names.str.startswith('MALAT1')
mito_genes = adata.var_names.str.startswith('MT-')
hb_genes = adata.var_names.str.contains('^HB[^(P)]')
remove = np.add(mito_genes, malat1)
remove = np.add(remove, hb_genes)
keep = np.invert(remove)
adata = adata[:,keep]
print(adata.n_obs, adata.n_vars)
Removing some unwanted genes named MALAT1 71184 26113
#top 20 Highest expressing genes
print ("After removing some unwanted genes, current top 20 highest expressed genes")
sc.pl.highest_expr_genes(adata, n_top=20)
After removing some unwanted genes, current top 20 highest expressed genes
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/preprocessing/_normalization.py:170: UserWarning: Received a view of an AnnData. Making a copy.
normalizing counts per cell
finished (0:00:01)
# adata raw is assigned with adata for post processing
adata.raw = adata
#Doublet detetection
print ("Doublet detection")
import scrublet as scr
# split per batch into new objects.
batches = adata.obs['sample'].cat.categories.tolist()
alldata = {}
for batch in batches:
tmp = adata[adata.obs['sample'] == batch,]
print(batch, ":", tmp.shape[0], " cells")
scrub = scr.Scrublet(tmp.raw.X)
out = scrub.scrub_doublets(verbose=False, n_prin_comps = 20)
alldata[batch] = pd.DataFrame({'doublet_score':out[0],'predicted_doublets':out[1]},index = tmp.obs.index)
print(alldata[batch].predicted_doublets.sum(), " predicted_doublets")
Doublet detection Sarc-1 : 6854 cells 294 predicted_doublets Sarc-2 : 9684 cells 628 predicted_doublets Sarc-3 : 8238 cells 448 predicted_doublets healthy-1 : 11654 cells 671 predicted_doublets healthy-2 : 11654 cells 671 predicted_doublets healthy-3 : 23100 cells 1126 predicted_doublets
# Doublet detector predictions adding to the adata object.
scrub_pred = pd.concat(alldata.values())
adata.obs['doublet_scores'] = scrub_pred['doublet_score']
adata.obs['predicted_doublets'] = scrub_pred['predicted_doublets']
sum(adata.obs['predicted_doublets'])
3838
# add in column with singlet/doublet instead of True/Fals
%matplotlib inline
adata.obs['doublet_info'] = adata.obs["predicted_doublets"].astype(str)
sc.pl.violin(adata, 'n_genes_by_counts',
jitter=0.4, groupby = 'doublet_info', rotation=45)
... storing 'doublet_info' as categorical
#sandard Pipeline used here
print ("Standard pipeline SC RNA seq")
#Normalize the data
sc.pp.normalize_total(adata)
#Logarithmize the data
sc.pp.log1p(adata)
#Identify highly-variable genes
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
#displaying Highly variable genes before normalize and after normalization
sc.pl.highly_variable_genes(adata)
#Getting back to an AnnData of the object in .raw by calling .raw.to_adata().
adata.raw = adata
#filtering highly variable genes
adata = adata[:, adata.var.highly_variable]
#Regressing out effects of total counts per cell and the percentage of mitochondrial genes expressed
sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt'])
#Scale the data to unit variance and Clip values exceeding standard deviation 10
sc.pp.scale(adata, max_value=10)
#Reduce the dimensionality of the data by running principal component analysis (PCA). Denoising the data
sc.tl.pca(adata, svd_solver='arpack')
#scatter plot generation in the PCA coordinates
sc.pl.pca(adata, color='CST3')
#PCA total variance data
from numba import njit
sc.pl.pca_variance_ratio(adata, log=True)
Standard pipeline SC RNA seq
normalizing counts per cell
finished (0:00:01)
extracting highly variable genes
finished (0:00:06)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
regressing out ['total_counts', 'pct_counts_mt']
sparse input is densified and may lead to high memory use
finished (0:03:53)
computing PCA
on highly variable genes
with n_comps=50
finished (0:00:13)
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:163: MatplotlibDeprecationWarning: The get_cmap function was deprecated in Matplotlib 3.7 and will be removed two minor releases later. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap(obj)`` instead.
#Computing the neighborhood graph
#Neighborhood graph of cells using the PCA representation of the data matrix computation.
print ("Computing neighborhood graphs and Clustering")
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=50)
#Clustering the neighborhood graph using Leiden Clustering algorithm
sc.tl.leiden(adata)
sc.tl.umap(adata)
#PLoting UMAP with clusters using Leiden algorithm
sc.pl.umap(adata, color=['leiden', 'CST3', 'NKG7'])
Computing neighborhood graphs and Clustering
computing neighbors
using 'X_pca' with n_pcs = 50
/home/jana/my-notebook-venv/lib/python3.8/site-packages/numba/core/typed_passes.py:334: NumbaPerformanceWarning:
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.
To find out why, try turning on parallel diagnostics, see https://numba.readthedocs.io/en/stable/user/parallel.html#diagnostics for help.
File "my-notebook-venv/lib/python3.8/site-packages/umap/rp_tree.py", line 135:
@numba.njit(fastmath=True, nogil=True, parallel=True)
def euclidean_random_projection_split(data, indices, rng_state):
^
/home/jana/my-notebook-venv/lib/python3.8/site-packages/umap/nndescent.py:91: NumbaPerformanceWarning:
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.
To find out why, try turning on parallel diagnostics, see https://numba.readthedocs.io/en/stable/user/parallel.html#diagnostics for help.
File "my-notebook-venv/lib/python3.8/site-packages/umap/utils.py", line 409:
@numba.njit(parallel=True)
def build_candidates(current_graph, n_vertices, n_neighbors, max_candidates, rng_state):
^
/home/jana/my-notebook-venv/lib/python3.8/site-packages/numba/core/typed_passes.py:334: NumbaPerformanceWarning:
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.
To find out why, try turning on parallel diagnostics, see https://numba.readthedocs.io/en/stable/user/parallel.html#diagnostics for help.
File "my-notebook-venv/lib/python3.8/site-packages/umap/nndescent.py", line 47:
@numba.njit(parallel=True)
def nn_descent(
^
finished: added to `.uns['neighbors']`
`.obsp['distances']`, distances for each pair of neighbors
`.obsp['connectivities']`, weighted adjacency matrix (0:00:34)
running Leiden clustering
finished: found 37 clusters and added
'leiden', the cluster labels (adata.obs, categorical) (0:00:11)
computing UMAP
finished: added
'X_umap', UMAP coordinates (adata.obsm) (0:01:31)
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:163: MatplotlibDeprecationWarning: The get_cmap function was deprecated in Matplotlib 3.7 and will be removed two minor releases later. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap(obj)`` instead. /home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:392: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
#Displaying Doublet scores, Doublet info and Sample wise distrubtion
print ("Doublet finding plots with scores and info across the samples")
sc.pl.umap(adata, color=['doublet_scores','doublet_info','sample'])
Doublet finding plots with scores and info across the samples
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:163: MatplotlibDeprecationWarning: The get_cmap function was deprecated in Matplotlib 3.7 and will be removed two minor releases later. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap(obj)`` instead. /home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:392: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored /home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:392: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
#Doublet removing and Rest samples for post processing
print ("Considering the False Doublet finding information, means we are considering non doublet portion for the rest of tha analysis")
# also revert back to the raw counts as the main matrix in adata
adata = adata.raw.to_adata()
adata = adata[adata.obs['doublet_info'] == 'False',:]
print ("Current shape of the data")
print(adata.shape)
Considering the False Doublet finding information, means we are considering singlet portion for the rest of tha analysis Current shape of the data (67346, 26113)
#umap of the current data after the doublet removal
sc.pl.umap(adata, color=['sample','leiden'])
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:163: MatplotlibDeprecationWarning: The get_cmap function was deprecated in Matplotlib 3.7 and will be removed two minor releases later. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap(obj)`` instead. /home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:392: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored /home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:392: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
#BatchCorrection is done using Harmony
print ("Batch correction using Harmony algorithm")
sce.pp.harmony_integrate(adata, 'sample')
2023-11-08 14:05:27,443 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans... INFO:harmonypy:Computing initial centroids with sklearn.KMeans...
Batch correction using Harmony algorithm
2023-11-08 14:06:03,324 - harmonypy - INFO - sklearn.KMeans initialization complete. INFO:harmonypy:sklearn.KMeans initialization complete. 2023-11-08 14:06:04,121 - harmonypy - INFO - Iteration 1 of 10 INFO:harmonypy:Iteration 1 of 10 2023-11-08 14:06:51,663 - harmonypy - INFO - Iteration 2 of 10 INFO:harmonypy:Iteration 2 of 10 2023-11-08 14:07:37,956 - harmonypy - INFO - Iteration 3 of 10 INFO:harmonypy:Iteration 3 of 10 2023-11-08 14:08:25,802 - harmonypy - INFO - Iteration 4 of 10 INFO:harmonypy:Iteration 4 of 10 2023-11-08 14:09:12,448 - harmonypy - INFO - Converged after 4 iterations INFO:harmonypy:Converged after 4 iterations
#Post proccssing of Harmonization
print ("Post proccssing of Harmonization")
#current PCA is aligned to Harmonized PCA values
adata.obsm['X_pca'] = adata.obsm['X_pca_harmony']
#Computing neighborhood graphs and Clustering
print ("Computing neighborhood graphs and Clustering")
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=50)
#Clustering the neighborhood graph using Leiden Clustering algorithm
sc.tl.leiden(adata)
sc.tl.umap(adata)
Post proccssing of Harmonization
Computing neighborhood graphs and Clustering
computing neighbors
using 'X_pca' with n_pcs = 50
/home/jana/my-notebook-venv/lib/python3.8/site-packages/numba/core/typed_passes.py:334: NumbaPerformanceWarning:
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.
To find out why, try turning on parallel diagnostics, see https://numba.readthedocs.io/en/stable/user/parallel.html#diagnostics for help.
File "my-notebook-venv/lib/python3.8/site-packages/umap/nndescent.py", line 47:
@numba.njit(parallel=True)
def nn_descent(
^
finished: added to `.uns['neighbors']`
`.obsp['distances']`, distances for each pair of neighbors
`.obsp['connectivities']`, weighted adjacency matrix (0:00:21)
running Leiden clustering
finished: found 25 clusters and added
'leiden', the cluster labels (adata.obs, categorical) (0:00:13)
computing UMAP
finished: added
'X_umap', UMAP coordinates (adata.obsm) (0:01:27)
#UMAP cluster view Sample wise and cluster wise
sc.pl.umap(adata, color=['sample', 'leiden'])
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:163: MatplotlibDeprecationWarning: The get_cmap function was deprecated in Matplotlib 3.7 and will be removed two minor releases later. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap(obj)`` instead. /home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:392: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored /home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:392: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
#display anndata
adata
AnnData object with n_obs × n_vars = 67346 × 26113
obs: 'type', 'sample', 'batch', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'pct_counts_hb', 'percent_mt2', 'n_counts', 'n_genes', 'doublet_scores', 'predicted_doublets', 'doublet_info', 'leiden'
var: 'gene_ids', 'feature_types', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
uns: 'sample_colors', 'doublet_info_colors', 'log1p', 'hvg', 'pca', 'neighbors', 'leiden', 'umap', 'leiden_colors'
obsm: 'X_pca', 'X_umap', 'X_pca_harmony'
obsp: 'distances', 'connectivities'
#import scipy io package
from scipy import io
save_file = '/home/jana/scanpy_qc_filtered_pbmcs_for_sarcoid.h5ad'
adata.write_h5ad(save_file)